In [ ]:
!pip install pandas
import pandas as pd
!pip install matplotlib
import matplotlib.pyplot as plt
!pip install seaborn
import seaborn as sns
!pip install numpy
import numpy as np
!pip install openpyxl
!pip install nbconvert
In [ ]:
!pip install openpyxl
In [55]:
import pandas as pd
df = pd.read_excel(r"C:\Users\Home\Documents\excel classes\Supermarket Sales Cleaned.xlsx")
In [56]:
df.to_csv("Supermarket Sales Cleaned.csv",index=False)
In [57]:
df.head()
Out[57]:
| Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Date | Time | Payment | cogs | gross margin percentage | gross income | Rating | Day | Month | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 750-67-8428 | A | Yangon | Member | Female | Health and beauty | 74.69 | 7 | 26.1415 | 548.9715 | 2019-01-05 | 0.547222 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 | 5 | 1 | 2019 |
| 1 | 226-31-3081 | C | Naypyitaw | Normal | Female | Electronic accessories | 15.28 | 5 | 3.8200 | 80.2200 | 2019-03-08 | 0.436806 | Cash | 76.40 | 4.761905 | 3.8200 | 9.6 | 8 | 3 | 2019 |
| 2 | 631-41-3108 | A | Yangon | Normal | Male | Home and lifestyle | 46.33 | 7 | 16.2155 | 340.5255 | 2019-03-03 | 0.557639 | Credit card | 324.31 | 4.761905 | 16.2155 | 7.4 | 3 | 3 | 2019 |
| 3 | 123-19-1176 | A | Yangon | Member | Male | Health and beauty | 58.22 | 8 | 23.2880 | 489.0480 | 2019-01-27 | 0.856250 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 | 27 | 1 | 2019 |
| 4 | 373-73-7910 | A | Yangon | Normal | Male | Sports and travel | 86.31 | 7 | 30.2085 | 634.3785 | 2019-02-08 | 0.442361 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 | 8 | 2 | 2019 |
In [58]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 500 entries, 0 to 499 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Invoice ID 500 non-null object 1 Branch 500 non-null object 2 City 500 non-null object 3 Customer type 500 non-null object 4 Gender 500 non-null object 5 Product line 500 non-null object 6 Unit price 500 non-null float64 7 Quantity 500 non-null int64 8 Tax 5% 500 non-null float64 9 Total 500 non-null float64 10 Date 500 non-null datetime64[ns] 11 Time 500 non-null float64 12 Payment 500 non-null object 13 cogs 500 non-null float64 14 gross margin percentage 500 non-null float64 15 gross income 500 non-null float64 16 Rating 500 non-null float64 17 Day 500 non-null int64 18 Month 500 non-null int64 19 Year 500 non-null int64 dtypes: datetime64[ns](1), float64(8), int64(4), object(7) memory usage: 78.3+ KB
In [59]:
df.describe()
Out[59]:
| Unit price | Quantity | Tax 5% | Total | Date | Time | cogs | gross margin percentage | gross income | Rating | Day | Month | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.00000 | 500.000000 | 500.0 |
| mean | 54.850140 | 5.692000 | 15.714081 | 329.995701 | 2019-02-15 06:54:43.200000 | 0.646029 | 314.281620 | 4.761905 | 15.714081 | 7.018600 | 14.70400 | 2.056000 | 2019.0 |
| min | 10.590000 | 1.000000 | 0.627000 | 13.167000 | 2019-01-01 00:00:00 | 0.416667 | 12.540000 | 4.761905 | 0.627000 | 4.000000 | 1.00000 | 1.000000 | 2019.0 |
| 25% | 30.560000 | 3.000000 | 6.431750 | 135.066750 | 2019-01-25 18:00:00 | 0.534375 | 128.635000 | 4.761905 | 6.431750 | 5.600000 | 7.00000 | 1.000000 | 2019.0 |
| 50% | 52.425000 | 6.000000 | 12.892500 | 270.742500 | 2019-02-15 00:00:00 | 0.646875 | 257.850000 | 4.761905 | 12.892500 | 7.000000 | 14.00000 | 2.000000 | 2019.0 |
| 75% | 77.772500 | 8.000000 | 22.847875 | 479.805375 | 2019-03-09 00:00:00 | 0.765451 | 456.957500 | 4.761905 | 22.847875 | 8.500000 | 23.00000 | 3.000000 | 2019.0 |
| max | 99.960000 | 10.000000 | 49.650000 | 1042.650000 | 2019-03-30 00:00:00 | 0.874306 | 993.000000 | 4.761905 | 49.650000 | 10.000000 | 31.00000 | 3.000000 | 2019.0 |
| std | 26.848516 | 2.899301 | 11.709972 | 245.909409 | NaN | 0.133405 | 234.199437 | 0.000000 | 11.709972 | 1.719018 | 8.69918 | 0.842784 | 0.0 |
In [60]:
df[df['Unit price'] > 50]
Out[60]:
| Invoice ID | Branch | City | Customer type | Gender | Product line | Unit price | Quantity | Tax 5% | Total | Date | Time | Payment | cogs | gross margin percentage | gross income | Rating | Day | Month | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 750-67-8428 | A | Yangon | Member | Female | Health and beauty | 74.69 | 7 | 26.1415 | 548.9715 | 2019-01-05 | 0.547222 | Ewallet | 522.83 | 4.761905 | 26.1415 | 9.1 | 5 | 1 | 2019 |
| 3 | 123-19-1176 | A | Yangon | Member | Male | Health and beauty | 58.22 | 8 | 23.2880 | 489.0480 | 2019-01-27 | 0.856250 | Ewallet | 465.76 | 4.761905 | 23.2880 | 8.4 | 27 | 1 | 2019 |
| 4 | 373-73-7910 | A | Yangon | Normal | Male | Sports and travel | 86.31 | 7 | 30.2085 | 634.3785 | 2019-02-08 | 0.442361 | Ewallet | 604.17 | 4.761905 | 30.2085 | 5.3 | 8 | 2 | 2019 |
| 5 | 699-14-3026 | C | Naypyitaw | Normal | Male | Electronic accessories | 85.39 | 7 | 29.8865 | 627.6165 | 2019-03-25 | 0.770833 | Ewallet | 597.73 | 4.761905 | 29.8865 | 4.1 | 25 | 3 | 2019 |
| 6 | 355-53-5943 | A | Yangon | Member | Female | Electronic accessories | 68.84 | 6 | 20.6520 | 433.6920 | 2019-02-25 | 0.608333 | Ewallet | 413.04 | 4.761905 | 20.6520 | 5.8 | 25 | 2 | 2019 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 495 | 632-32-4574 | B | Mandalay | Normal | Male | Sports and travel | 75.92 | 8 | 30.3680 | 637.7280 | 2019-03-20 | 0.593056 | Cash | 607.36 | 4.761905 | 30.3680 | 5.5 | 20 | 3 | 2019 |
| 496 | 556-97-7101 | C | Naypyitaw | Normal | Female | Electronic accessories | 63.22 | 2 | 6.3220 | 132.7620 | 2019-01-01 | 0.660417 | Cash | 126.44 | 4.761905 | 6.3220 | 8.5 | 1 | 1 | 2019 |
| 497 | 862-59-8517 | C | Naypyitaw | Normal | Female | Food and beverages | 90.24 | 6 | 27.0720 | 568.5120 | 2019-01-27 | 0.470139 | Cash | 541.44 | 4.761905 | 27.0720 | 6.2 | 27 | 1 | 2019 |
| 498 | 401-18-8016 | B | Mandalay | Member | Female | Sports and travel | 98.13 | 1 | 4.9065 | 103.0365 | 2019-01-21 | 0.733333 | Cash | 98.13 | 4.761905 | 4.9065 | 8.9 | 21 | 1 | 2019 |
| 499 | 420-18-8989 | A | Yangon | Member | Female | Sports and travel | 51.52 | 8 | 20.6080 | 432.7680 | 2019-02-02 | 0.657639 | Cash | 412.16 | 4.761905 | 20.6080 | 9.6 | 2 | 2 | 2019 |
264 rows × 20 columns
In [61]:
print(df['Year'].dtype)
int64
In [63]:
Lineplot:df['cogs'].plot(kind ='line')
In [64]:
Barchart:df['Quantity'].plot(kind = 'bar')
In [65]:
Histogram:df['Unit price'].plot(kind='hist')
In [66]:
import matplotlib.pyplot as plt
In [67]:
df.groupby('Date')['cogs'].sum().plot(kind='line',color="purple")
plt.title("Cost of Goods Sold")
plt.xlabel("Date")
plt.ylabel("Cost")
Out[67]:
Text(0, 0.5, 'Cost')
In [68]:
df.groupby('Month')['Quantity'].sum().plot(kind ="bar",color='blue')
plt.title('Quantity sold over time')
plt.xlabel('Month')
plt.ylabel('Quantity')
Out[68]:
Text(0, 0.5, 'Quantity')
In [69]:
df['Unit price'].plot(kind='hist',color='Lightgreen')
plt.title('Unit price Table')
plt.xlabel('Unit price')
plt.ylabel('Frequency')
Out[69]:
Text(0, 0.5, 'Frequency')
In [ ]:
df.isnull().sum()
In [71]:
df_no_duplicates=df.drop_duplicates('Product line')
In [72]:
df.rename(columns={'cogs':'Cost_of_Goods_Sold'},inplace=True)
print(df.rename)
df.rename(columns={'Cost_of_Goods_Sold':'cogs'},inplace=True)
print(df.rename)
<bound method DataFrame.rename of Invoice ID Branch City Customer type Gender \
0 750-67-8428 A Yangon Member Female
1 226-31-3081 C Naypyitaw Normal Female
2 631-41-3108 A Yangon Normal Male
3 123-19-1176 A Yangon Member Male
4 373-73-7910 A Yangon Normal Male
.. ... ... ... ... ...
495 632-32-4574 B Mandalay Normal Male
496 556-97-7101 C Naypyitaw Normal Female
497 862-59-8517 C Naypyitaw Normal Female
498 401-18-8016 B Mandalay Member Female
499 420-18-8989 A Yangon Member Female
Product line Unit price Quantity Tax 5% Total \
0 Health and beauty 74.69 7 26.1415 548.9715
1 Electronic accessories 15.28 5 3.8200 80.2200
2 Home and lifestyle 46.33 7 16.2155 340.5255
3 Health and beauty 58.22 8 23.2880 489.0480
4 Sports and travel 86.31 7 30.2085 634.3785
.. ... ... ... ... ...
495 Sports and travel 75.92 8 30.3680 637.7280
496 Electronic accessories 63.22 2 6.3220 132.7620
497 Food and beverages 90.24 6 27.0720 568.5120
498 Sports and travel 98.13 1 4.9065 103.0365
499 Sports and travel 51.52 8 20.6080 432.7680
Date Time Payment Cost_of_Goods_Sold \
0 2019-01-05 0.547222 Ewallet 522.83
1 2019-03-08 0.436806 Cash 76.40
2 2019-03-03 0.557639 Credit card 324.31
3 2019-01-27 0.856250 Ewallet 465.76
4 2019-02-08 0.442361 Ewallet 604.17
.. ... ... ... ...
495 2019-03-20 0.593056 Cash 607.36
496 2019-01-01 0.660417 Cash 126.44
497 2019-01-27 0.470139 Cash 541.44
498 2019-01-21 0.733333 Cash 98.13
499 2019-02-02 0.657639 Cash 412.16
gross margin percentage gross income Rating Day Month Year
0 4.761905 26.1415 9.1 5 1 2019
1 4.761905 3.8200 9.6 8 3 2019
2 4.761905 16.2155 7.4 3 3 2019
3 4.761905 23.2880 8.4 27 1 2019
4 4.761905 30.2085 5.3 8 2 2019
.. ... ... ... ... ... ...
495 4.761905 30.3680 5.5 20 3 2019
496 4.761905 6.3220 8.5 1 1 2019
497 4.761905 27.0720 6.2 27 1 2019
498 4.761905 4.9065 8.9 21 1 2019
499 4.761905 20.6080 9.6 2 2 2019
[500 rows x 20 columns]>
<bound method DataFrame.rename of Invoice ID Branch City Customer type Gender \
0 750-67-8428 A Yangon Member Female
1 226-31-3081 C Naypyitaw Normal Female
2 631-41-3108 A Yangon Normal Male
3 123-19-1176 A Yangon Member Male
4 373-73-7910 A Yangon Normal Male
.. ... ... ... ... ...
495 632-32-4574 B Mandalay Normal Male
496 556-97-7101 C Naypyitaw Normal Female
497 862-59-8517 C Naypyitaw Normal Female
498 401-18-8016 B Mandalay Member Female
499 420-18-8989 A Yangon Member Female
Product line Unit price Quantity Tax 5% Total \
0 Health and beauty 74.69 7 26.1415 548.9715
1 Electronic accessories 15.28 5 3.8200 80.2200
2 Home and lifestyle 46.33 7 16.2155 340.5255
3 Health and beauty 58.22 8 23.2880 489.0480
4 Sports and travel 86.31 7 30.2085 634.3785
.. ... ... ... ... ...
495 Sports and travel 75.92 8 30.3680 637.7280
496 Electronic accessories 63.22 2 6.3220 132.7620
497 Food and beverages 90.24 6 27.0720 568.5120
498 Sports and travel 98.13 1 4.9065 103.0365
499 Sports and travel 51.52 8 20.6080 432.7680
Date Time Payment cogs gross margin percentage \
0 2019-01-05 0.547222 Ewallet 522.83 4.761905
1 2019-03-08 0.436806 Cash 76.40 4.761905
2 2019-03-03 0.557639 Credit card 324.31 4.761905
3 2019-01-27 0.856250 Ewallet 465.76 4.761905
4 2019-02-08 0.442361 Ewallet 604.17 4.761905
.. ... ... ... ... ...
495 2019-03-20 0.593056 Cash 607.36 4.761905
496 2019-01-01 0.660417 Cash 126.44 4.761905
497 2019-01-27 0.470139 Cash 541.44 4.761905
498 2019-01-21 0.733333 Cash 98.13 4.761905
499 2019-02-02 0.657639 Cash 412.16 4.761905
gross income Rating Day Month Year
0 26.1415 9.1 5 1 2019
1 3.8200 9.6 8 3 2019
2 16.2155 7.4 3 3 2019
3 23.2880 8.4 27 1 2019
4 30.2085 5.3 8 2 2019
.. ... ... ... ... ...
495 30.3680 5.5 20 3 2019
496 6.3220 8.5 1 1 2019
497 27.0720 6.2 27 1 2019
498 4.9065 8.9 21 1 2019
499 20.6080 9.6 2 2 2019
[500 rows x 20 columns]>
In [73]:
grouped_df=df.groupby('City')['Total'].mean()
print(grouped_df)
City Mandalay 333.602544 Naypyitaw 336.058800 Yangon 320.229692 Name: Total, dtype: float64
In [74]:
agg_df=df.groupby('City').agg({ 'Total':'sum'})
print(agg_df)
agg_df=df.groupby('City').agg({ 'Total':'max'})
print(agg_df)
agg_df=df.groupby('City').agg({ 'Total':'count'})
print(agg_df)
Total
City
Mandalay 52709.2020
Naypyitaw 58810.2900
Yangon 53478.3585
Total
City
Mandalay 944.622
Naypyitaw 1042.650
Yangon 1039.290
Total
City
Mandalay 158
Naypyitaw 175
Yangon 167
In [75]:
joined_df=df[['Customer type','Payment']]
print(joined_df.head())
merged_df=pd.merge(df[['City','Payment','Total']],df[['City','Year','Customer type']], on='City',how='inner')
print(merged_df.head())
Customer type Payment
0 Member Ewallet
1 Normal Cash
2 Normal Credit card
3 Member Ewallet
4 Normal Ewallet
City Payment Total Year Customer type
0 Yangon Ewallet 548.9715 2019 Member
1 Yangon Ewallet 548.9715 2019 Normal
2 Yangon Ewallet 548.9715 2019 Member
3 Yangon Ewallet 548.9715 2019 Normal
4 Yangon Ewallet 548.9715 2019 Member
In [ ]:
concatenated_df=pd.concat([df['City'],df['Total']],axis=1)
print(concatenated_df)
In [ ]:
df_sorted=df.sort_values(by='Unit price',ascending=False)
print(df_sorted)
In [ ]:
df_sorted=df.sort_values(by='Total',ascendin=True)
print(df_sorted)
In [ ]:
filtered_df=df[df['Quantity']>5]
print(filtered_df)
In [ ]:
filtered_df=df[df['cogs']>600]
print(filtered_df)
In [ ]:
#Data visulaization with matplotlib and seaborn
In [77]:
plt.plot(df['City'],df['Quantity'])
plt.title('sales over city')
plt.xlabel('Month')
plt.ylabel('Total')
plt.show()
In [78]:
plt.bar(df['City'],df['Total'],color='purple')
plt.show
Out[78]:
<function matplotlib.pyplot.show(close=None, block=None)>
In [ ]:
import numpy as np
In [ ]:
data=np.random.randn(1000)
plt.hist(data,bins=15,edgecolor='black')
plt.show()
In [79]:
plt.scatter(df['City'],df['Quantity'])
plt.show()
In [ ]:
!pip install seaborn
import seaborn as sns
In [80]:
sns.scatterplot(x='Customer type',y='Quantity',data=df,color='red')
plt.title('scatterplot using seaborn')
plt.show()
In [81]:
sns.boxplot(x='Month',y='Total',data=df,color='brown')
plt.title('Box plot using seaborn')
plt.show()
In [ ]:
numeric_df=df[['Unit price','Total','Quantity','cogs','gross income']]
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix,annot=True,cmap='coolwarm_r')
plt.title('Heatmap using seaaborn')
plt.show()
In [82]:
pivot_table=df.pivot_table(index='City',columns='Product line',values='Quantity',aggfunc='max')
sns.heatmap(pivot_table,annot=True,cmap='cubehelix')
plt.title('Heatmap using seaaborn')
plt.xlabel('Product line')
plt.ylabel('City')
plt.show()
In [83]:
pivot_table=df.pivot_table(index='Product line',columns='City',values='Quantity',aggfunc='count')
sns.heatmap(pivot_table,annot=True,cmap='viridis')
Out[83]:
<Axes: xlabel='City', ylabel='Product line'>
In [84]:
sns.pairplot(df,hue='City')
plt.show()
In [85]:
sns.pairplot(df[['City','Total','Unit price','Quantity']],hue='City')
plt.show()
In [87]:
sns.violinplot(x='City',y='Total',data=df)
plt.title('Total sales distribution by city')
plt.show()
In [ ]:
jupyter nbconvert --to pdf Python Training Session.ipynb
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: